<!DOCTYPE html>
<html lang="zh-CN">
    <head>
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="robots" content="noodp" />
        <meta http-equiv="X-UA-Compatible" content="IE=edge, chrome=1">
        <title>Prometheus监控规则说明 - 德国粗茶淡饭</title><meta name="Description" content="Prometheus监控规则说明"><meta property="og:title" content="Prometheus监控规则说明" />
<meta property="og:description" content="Prometheus监控规则说明" />
<meta property="og:type" content="article" />
<meta property="og:url" content="https://www.ctq6.cn/technology/prometheus/prometheus%E7%9B%91%E6%8E%A7%E8%A7%84%E5%88%99%E8%AF%B4%E6%98%8E/" />
<meta property="og:image" content="https://www.ctq6.cn/logo.png"/>
<meta property="article:published_time" content="2021-02-16T19:21:56+08:00" />
<meta property="article:modified_time" content="2021-02-16T19:21:56+08:00" />
<meta name="twitter:card" content="summary_large_image"/>
<meta name="twitter:image" content="https://www.ctq6.cn/logo.png"/>

<meta name="twitter:title" content="Prometheus监控规则说明"/>
<meta name="twitter:description" content="Prometheus监控规则说明"/>
<meta name="application-name" content="LoveIt">
<meta name="apple-mobile-web-app-title" content="LoveIt"><meta name="theme-color" content="#ffffff"><meta name="msapplication-TileColor" content="#da532c"><link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" />
        <link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
        <link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png"><link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png"><link rel="mask-icon" href="/safari-pinned-tab.svg" color="#5bbad5"><link rel="manifest" href="/site.webmanifest"><link rel="canonical" href="https://www.ctq6.cn/technology/prometheus/prometheus%E7%9B%91%E6%8E%A7%E8%A7%84%E5%88%99%E8%AF%B4%E6%98%8E/" /><link rel="prev" href="https://www.ctq6.cn/technology/prometheus/prometheus%E7%9B%91%E6%8E%A7k8s%E9%9B%86%E7%BE%A4%E7%BB%84%E4%BB%B6/" /><link rel="next" href="https://www.ctq6.cn/technology/prometheus/prometheus_operator%E4%BD%BF%E7%94%A8/" /><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/normalize.css@8.0.1/normalize.min.css"><link rel="stylesheet" href="/css/style.min.css"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.13.0/css/all.min.css"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@3.7.2/animate.min.css"><script type="application/ld+json">
    {
        "@context": "http://schema.org",
        "@type": "BlogPosting",
        "headline": "Prometheus监控规则说明",
        "inLanguage": "zh-CN",
        "mainEntityOfPage": {
            "@type": "WebPage",
            "@id": "https:\/\/www.ctq6.cn\/technology\/prometheus\/prometheus%E7%9B%91%E6%8E%A7%E8%A7%84%E5%88%99%E8%AF%B4%E6%98%8E\/"
        },"image": [{
                            "@type": "ImageObject",
                            "url": "https:\/\/www.ctq6.cn\/images\/Apple-Devices-Preview.png",
                            "width":  3200 ,
                            "height":  2048 
                        }],"genre": "technology","keywords": "Prometheus","wordcount":  5955 ,
        "url": "https:\/\/www.ctq6.cn\/technology\/prometheus\/prometheus%E7%9B%91%E6%8E%A7%E8%A7%84%E5%88%99%E8%AF%B4%E6%98%8E\/","datePublished": "2021-02-16T19:21:56+08:00","dateModified": "2021-02-16T19:21:56+08:00","license": "This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.","publisher": {
            "@type": "Organization",
            "name": "xxxx","logo": {
                    "@type": "ImageObject",
                    "url": "https:\/\/www.ctq6.cn\/images\/avatar.png",
                    "width":  528 ,
                    "height":  560 
                }},"author": {
                "@type": "Person",
                "name": "MikelPan"
            },"description": "Prometheus监控规则说明"
    }
    </script></head>
    <body header-desktop="fixed" header-mobile="auto"><script type="text/javascript">(window.localStorage && localStorage.getItem('theme') ? localStorage.getItem('theme') === 'dark' : ('auto' === 'auto' ? window.matchMedia('(prefers-color-scheme: dark)').matches : 'auto' === 'dark')) && document.body.setAttribute('theme', 'dark');</script>

        <div id="mask"></div><div class="wrapper"><header class="desktop" id="header-desktop">
    <div class="header-wrapper">
        <div class="header-title">
            <a href="/" title="德国粗茶淡饭"><span class="header-title-pre"><i class='far fa-kiss-wink-heart fa-fw'></i></span>德国粗茶淡饭</a>
        </div>
        <div class="menu">
            <div class="menu-inner"><a class="menu-item" href="/posts/"> 生活 </a><a class="menu-item" href="/technology/"> 技术 </a><a class="menu-item" href="/tags/"> 标签 </a><a class="menu-item" href="/categories/"> 分类 </a><a class="menu-item" href="/categories/documentation/"> 文档 </a><a class="menu-item" href="/about/"> 关于 </a><a class="menu-item" href="https://github.com/MikelPan/Cnblog.git" title="GitHub" rel="noopener noreferrer" target="_blank"><i class='fab fa-github fa-fw'></i>  </a><span class="menu-item delimiter"></span><a href="javascript:void(0);" class="menu-item language" title="选择语言">简体中文<i class="fas fa-chevron-right fa-fw"></i>
                        <select class="language-select" id="language-select-desktop" onchange="location = this.value;"><option value="/technology/prometheus/prometheus%E7%9B%91%E6%8E%A7%E8%A7%84%E5%88%99%E8%AF%B4%E6%98%8E/" selected>简体中文</option></select>
                    </a><span class="menu-item search" id="search-desktop">
                        <input type="text" placeholder="搜索文章标题或内容..." id="search-input-desktop">
                        <a href="javascript:void(0);" class="search-button search-toggle" id="search-toggle-desktop" title="搜索">
                            <i class="fas fa-search fa-fw"></i>
                        </a>
                        <a href="javascript:void(0);" class="search-button search-clear" id="search-clear-desktop" title="清空">
                            <i class="fas fa-times-circle fa-fw"></i>
                        </a>
                        <span class="search-button search-loading" id="search-loading-desktop">
                            <i class="fas fa-spinner fa-fw fa-spin"></i>
                        </span>
                    </span><a href="javascript:void(0);" class="menu-item theme-switch" title="切换主题">
                    <i class="fas fa-adjust fa-fw"></i>
                </a>
            </div>
        </div>
    </div>
</header><header class="mobile" id="header-mobile">
    <div class="header-container">
        <div class="header-wrapper">
            <div class="header-title">
                <a href="/" title="德国粗茶淡饭"><span class="header-title-pre"><i class='far fa-kiss-wink-heart fa-fw'></i></span>德国粗茶淡饭</a>
            </div>
            <div class="menu-toggle" id="menu-toggle-mobile">
                <span></span><span></span><span></span>
            </div>
        </div>
        <div class="menu" id="menu-mobile"><div class="search-wrapper">
                    <div class="search mobile" id="search-mobile">
                        <input type="text" placeholder="搜索文章标题或内容..." id="search-input-mobile">
                        <a href="javascript:void(0);" class="search-button search-toggle" id="search-toggle-mobile" title="搜索">
                            <i class="fas fa-search fa-fw"></i>
                        </a>
                        <a href="javascript:void(0);" class="search-button search-clear" id="search-clear-mobile" title="清空">
                            <i class="fas fa-times-circle fa-fw"></i>
                        </a>
                        <span class="search-button search-loading" id="search-loading-mobile">
                            <i class="fas fa-spinner fa-fw fa-spin"></i>
                        </span>
                    </div>
                    <a href="javascript:void(0);" class="search-cancel" id="search-cancel-mobile">
                        取消
                    </a>
                </div><a class="menu-item" href="/posts/" title="">生活</a><a class="menu-item" href="/technology/" title="">技术</a><a class="menu-item" href="/tags/" title="">标签</a><a class="menu-item" href="/categories/" title="">分类</a><a class="menu-item" href="/categories/documentation/" title="">文档</a><a class="menu-item" href="/about/" title="">关于</a><a class="menu-item" href="https://github.com/MikelPan/Cnblog.git" title="GitHub" rel="noopener noreferrer" target="_blank"><i class='fab fa-github fa-fw'></i></a><a href="javascript:void(0);" class="menu-item theme-switch" title="切换主题">
                <i class="fas fa-adjust fa-fw"></i>
            </a><a href="javascript:void(0);" class="menu-item" title="选择语言">简体中文<i class="fas fa-chevron-right fa-fw"></i>
                    <select class="language-select" onchange="location = this.value;"><option value="/technology/prometheus/prometheus%E7%9B%91%E6%8E%A7%E8%A7%84%E5%88%99%E8%AF%B4%E6%98%8E/" selected>简体中文</option></select>
                </a></div>
    </div>
</header>
<div class="search-dropdown desktop">
    <div id="search-dropdown-desktop"></div>
</div>
<div class="search-dropdown mobile">
    <div id="search-dropdown-mobile"></div>
</div>
<main class="main">
                <div class="container"><div class="page single special"><h1 class="single-title animated pulse faster">Prometheus监控规则说明</h1><div class="content" id="content"><h2 id="prometheus-operator-监控指标">prometheus operator 监控指标</h2>
<h3 id="kubernetes-资源相关">kubernetes 资源相关</h3>
<p><strong>CPUThrottlingHigh</strong></p>
<p>关于 CPU 的 limit 合理性指标。查出最近5分钟，超过25%的 CPU 执行周期受到限制的容器。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>increase<span class="o">(</span>container_cpu_cfs_throttled_periods_total<span class="o">{</span>container!<span class="o">=</span><span class="s2">&#34;&#34;</span>, <span class="o">}[</span>5m<span class="o">]))</span> by <span class="o">(</span>container, pod, namespace<span class="o">)</span>
          /
sum<span class="o">(</span>increase<span class="o">(</span>container_cpu_cfs_periods_total<span class="o">{}[</span>5m<span class="o">]))</span> by <span class="o">(</span>container, pod, namespace<span class="o">)</span>
          &gt; <span class="o">(</span> <span class="m">25</span> / <span class="m">100</span> <span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>container_cpu_cfs_periods_total</code>：容器生命周期中度过的 cpu 周期总数</li>
<li><code>container_cpu_cfs_throttled_periods_total</code>：容器生命周期中度过的受限的 cpu 周期总数</li>
</ul>
<p><strong>KubeCPUOvercommit</strong></p>
<p>集群 CPU 过度使用。CPU 已经过度使用无法容忍节点故障，节点资源使用的总量超过节点的 CPU 总量，所以如果有节点故障将影响集群资源运行因为所需资源将无法被分配。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>namespace:kube_pod_container_resource_requests_cpu_cores:sum<span class="o">{})</span>
          /
sum<span class="o">(</span>kube_node_status_allocatable_cpu_cores<span class="o">)</span>
          &gt;
<span class="o">(</span>count<span class="o">(</span>kube_node_status_allocatable_cpu_cores<span class="o">)</span>-1<span class="o">)</span> / count<span class="o">(</span>kube_node_status_allocatable_cpu_cores<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kube_pod_container_resource_requests_cpu_cores</code>：资源 CPU 使用的 cores 数量</li>
<li><code>kube_node_status_allocatable_cpu_cores</code>：节点 CPU cores 数量</li>
</ul>
<p><strong>KubeMemoryOvercommit</strong></p>
<p>集群内存过度使用。内存已经过度使用无法容忍节点故障，节点资源使用的总量超过节点的内存总量，所以如果有节点故障将影响集群资源运行因为所需资源将无法被分配。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span><span class="lnt">6
</span><span class="lnt">7
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>namespace:kube_pod_container_resource_requests_memory_bytes:sum<span class="o">{})</span>
          /
        sum<span class="o">(</span>kube_node_status_allocatable_memory_bytes<span class="o">)</span>
          &gt;
        <span class="o">(</span>count<span class="o">(</span>kube_node_status_allocatable_memory_bytes<span class="o">)</span>-1<span class="o">)</span>
          /
        count<span class="o">(</span>kube_node_status_allocatable_memory_bytes<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kube_pod_container_resource_requests_memory_bytes</code>：资源内存使用的量</li>
<li><code>kube_node_status_allocatable_memory_bytes</code>：节点内存量</li>
</ul>
<p><strong>KubeCPUQuotaOvercommit</strong></p>
<p>集群 CPU 是否超分。查看 CPU 资源分配的额度是否超过集群总额度。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>kube_pod_container_resource_limits_cpu_cores<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">})</span>
          /
        sum<span class="o">(</span>kube_node_status_allocatable_cpu_cores<span class="o">)</span>
          &gt; 1.1
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kube_pod_container_resource_limits_cpu_cores</code>：资源分配的 CPU 资源额度</li>
<li><code>kube_node_status_allocatable_cpu_cores</code>：节点 CPU 总量</li>
</ul>
<p><strong>KubeMemoryQuotaOvercommit</strong></p>
<p>集群内存是否超分。查看内存资源分配的额度是否超过集群总额度。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>kube_pod_container_resource_limits_memory_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">})</span>
          /
        sum<span class="o">(</span>kube_node_status_allocatable_memory_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">})</span>
          &gt; 1.1
</code></pre></td></tr></table>
</div>
</div><p>相关指标:</p>
<ul>
<li><code>kube_pod_container_resource_limits_memory_bytes</code>：资源配额内存量</li>
<li><code>kube_node_status_allocatable_memory_bytes</code>：节点内存量</li>
</ul>
<p><strong>KubeMEMQuotaExceeded</strong></p>
<p>命名空间级内存资源使用的比例，关乎资源配额。当使用 request 和 limit 限制资源时，使用值和最大值还是有一点区别，当有 request 时说明最低分配了这么多资源。需要注意，当 request 等于 limit 时说明资源已经100%分配使用，当监控告警发出的时候需要加以区分。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum <span class="o">(</span>kube_pod_container_resource_requests_memory_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span> <span class="o">)</span> by <span class="o">(</span>namespace<span class="o">)</span>/ <span class="o">(</span>sum<span class="o">(</span>kube_pod_container_resource_limits_memory_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">})</span> by <span class="o">(</span>namespace<span class="o">))</span> &gt; 0.8
</code></pre></td></tr></table>
</div>
</div><p>相关指标:</p>
<ul>
<li><code>kube_pod_container_resource_requests_memory_bytes</code>：内存资源使用量</li>
<li><code>kube_pod_container_resource_limits_memory_bytes</code>：内存资源最大值</li>
</ul>
<p><strong>KubeCPUQuotaExceeded</strong></p>
<p>命名空间级 CPU 资源使用的比例，关乎资源配额。当使用 request 和 limit 限制资源时，使用值和最大值还是有一点区别，当有 request 时说明最低分配了这么多资源。需要注意，当 request 等于 limit 时说明资源已经100%分配使用，当监控告警发出的时候需要加以区分。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum <span class="o">(</span>kube_pod_container_resource_requests_cpu_cores<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span> <span class="o">)</span> by <span class="o">(</span>namespace<span class="o">)</span>/ <span class="o">(</span>sum<span class="o">(</span>kube_pod_container_resource_limits_cpu_cores<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">})</span> by <span class="o">(</span>namespace<span class="o">))</span> &gt; 0.8
</code></pre></td></tr></table>
</div>
</div><p>相关指标:</p>
<ul>
<li><code>kube_pod_container_resource_requests_cpu_cores</code>：CPU 使用量</li>
<li><code>kube_pod_container_resource_limits_cpu_cores</code>：CPU 限额最大值</li>
</ul>
<h3 id="kubernetes-存储相关">Kubernetes 存储相关</h3>
<p><strong>KubePersistentVolumeFillingUp</strong></p>
<p>PVC 容量监控，表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kubelet_volume_stats_available_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">}</span>
          /
        kubelet_volume_stats_capacity_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">}</span>
          &lt; 0.3
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kubelet_volume_stats_available_bytes</code>：剩余空间</li>
<li><code>kubelet_volume_stats_capacity_bytes</code>：空间总量</li>
</ul>
<p><strong>KubePersistentVolumeFillingUp</strong></p>
<p>磁盘空间耗尽预测：通过 PVC 资源使用6小时变化率预测接下来4天的磁盘使用率，表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span><span class="lnt">6
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>kubelet_volume_stats_available_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">}</span>
            /
          kubelet_volume_stats_capacity_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">}</span>
        <span class="o">)</span> &lt; 0.4
        and
        predict_linear<span class="o">(</span>kubelet_volume_stats_available_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">}[</span>6h<span class="o">]</span>, <span class="m">4</span> * <span class="m">24</span> * 3600<span class="o">)</span> &lt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标:</p>
<ul>
<li><code>kubelet_volume_stats_available_bytes</code>：剩余空间</li>
<li><code>kubelet_volume_stats_capacity_bytes</code>：空间总量</li>
</ul>
<p><strong>KubePersistentVolumeErrors</strong></p>
<p>PV 使用状态监控。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kube_persistentvolume_status_phase<span class="o">{</span><span class="nv">phase</span><span class="o">=</span>~<span class="s2">&#34;Failed|Pending&#34;</span>,job<span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kube_persistentvolume_status_phase</code>：PV 使用状态</li>
</ul>
<h3 id="kubernetes-system-相关">kubernetes system 相关</h3>
<p><strong>KubeVersionMismatch</strong></p>
<p>组件版本与当前集群版本是否有差异。统计各组件去重后的版本数量来对比组件版本是否有差异，版本一致时结果为1。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">count<span class="o">(</span>count by <span class="o">(</span>gitVersion<span class="o">)</span> <span class="o">(</span>label_replace<span class="o">(</span>kubernetes_build_info<span class="o">{</span>job!~<span class="s2">&#34;kube-dns|coredns&#34;</span><span class="o">}</span>,<span class="s2">&#34;gitVersion&#34;</span>,<span class="s2">&#34;</span><span class="nv">$1</span><span class="s2">&#34;</span>,<span class="s2">&#34;gitVersion&#34;</span>,<span class="s2">&#34;(v[0-9]*.[0-9]*.[0-9]*).*&#34;</span><span class="o">)))</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kubernetes_build_info</code>：获取组件信息</li>
</ul>
<p><strong>KubeClientErrors</strong></p>
<p>客户端访问某些接口的错误率。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>sum<span class="o">(</span>rate<span class="o">(</span>rest_client_requests_total<span class="o">{</span><span class="nv">code</span><span class="o">=</span>~<span class="s2">&#34;5..&#34;</span><span class="o">}[</span>5m<span class="o">]))</span> by <span class="o">(</span>instance, job<span class="o">)</span>
          /
        sum<span class="o">(</span>rate<span class="o">(</span>rest_client_requests_total<span class="o">[</span>5m<span class="o">]))</span> by <span class="o">(</span>instance, job<span class="o">))</span>
        &gt; 0.01
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>rest_client_requests_total</code>：客户端请求总数，可通过 code 标签按状态码筛选</li>
</ul>
<h3 id="apiserver-相关">APIServer 相关</h3>
<p><strong>KubeAPIErrorsHigh</strong></p>
<p>APIServer 请求错误率。5分钟内 APIServer 请求错误率。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>rate<span class="o">(</span>apiserver_request_total<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span>,code<span class="o">=</span>~<span class="s2">&#34;5..&#34;</span><span class="o">}[</span>5m<span class="o">]))</span> by <span class="o">(</span>resource,subresource,verb<span class="o">)</span>
          /
        sum<span class="o">(</span>rate<span class="o">(</span>apiserver_request_total<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span><span class="o">}[</span>5m<span class="o">]))</span> by <span class="o">(</span>resource,subresource,verb<span class="o">)</span> &gt; 0.05
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>apiserver_request_total</code>：APIServer 请求数</li>
</ul>
<p><strong>KubeClientCertificateExpiration</strong></p>
<p>kubelet 客户端证书过期。监测证书状态30天告警和7天告警。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="c1"># 30天</span>
apiserver_client_certificate_expiration_seconds_count<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span><span class="o">}</span> &gt; <span class="m">0</span> and on<span class="o">(</span>job<span class="o">)</span> histogram_quantile<span class="o">(</span>0.01, sum by <span class="o">(</span>job, le<span class="o">)</span> <span class="o">(</span>rate<span class="o">(</span>apiserver_client_certificate_expiration_seconds_bucket<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span><span class="o">}[</span>5m<span class="o">])))</span> &lt; <span class="m">2592000</span>
<span class="c1"># 7天</span>
apiserver_client_certificate_expiration_seconds_count<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span><span class="o">}</span> &gt; <span class="m">0</span> and on<span class="o">(</span>job<span class="o">)</span> histogram_quantile<span class="o">(</span>0.01, sum by <span class="o">(</span>job, le<span class="o">)</span> <span class="o">(</span>rate<span class="o">(</span>apiserver_client_certificate_expiration_seconds_bucket<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span><span class="o">}[</span>5m<span class="o">])))</span> &lt; <span class="m">604800</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>apiserver_client_certificate_expiration_seconds_count</code>：证书有效剩余时间</li>
</ul>
<p><strong>AggregatedAPIErrors</strong></p>
<p>自定义注册的 APIServer 服务可用性监控，当检测到自定义注册的 APIServer 五分钟内不可用次数超过2次时告警。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum by<span class="o">(</span>name, namespace<span class="o">)(</span>increase<span class="o">(</span>aggregator_unavailable_apiservice_count<span class="o">[</span>5m<span class="o">]))</span> &gt; <span class="m">2</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标:</p>
<ul>
<li><code>aggregator_unavailable_apiservice_count</code>：监测自定义注册的 APIService 不可用次数。</li>
</ul>
<p><strong>KubeAPIDown</strong></p>
<p>APIserver 失联，监控 APIServer 服务，失联原因可能是服务 down 还可能是网络出现状况。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">absent<span class="o">(</span>up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;apiserver&#34;</span><span class="o">}</span> <span class="o">==</span> 1<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="kubelet-相关">kubelet 相关</h3>
<p><strong>KubeNodeNotReady</strong></p>
<p>节点是否处于就绪状态。检测节点是否为就绪状态，或者可能是 kubelet 服务down 了。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kube_node_status_condition<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span>,condition<span class="o">=</span><span class="s2">&#34;Ready&#34;</span>,status<span class="o">=</span><span class="s2">&#34;true&#34;</span><span class="o">}</span> <span class="o">==</span> <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kube_node_status_condition</code>：节点状态监测</li>
</ul>
<p><strong>KubeNodeUnreachable</strong></p>
<p>节点状态为 Unreachable。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kube_node_spec_unschedulable<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span> <span class="o">==</span> <span class="m">1</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>KubeletTooManyPods</strong></p>
<p>节点运行过多的 Pod，监测节点上运行的 Pods 数量。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">max<span class="o">(</span>max<span class="o">(</span>kubelet_running_pod_count<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">})</span> by<span class="o">(</span>instance<span class="o">)</span> * on<span class="o">(</span>instance<span class="o">)</span> group_left<span class="o">(</span>node<span class="o">)</span> kubelet_node_name<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">})</span> by<span class="o">(</span>node<span class="o">)</span> / max<span class="o">(</span>kube_node_status_capacity_pods<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span> !<span class="o">=</span> 1<span class="o">)</span> by<span class="o">(</span>node<span class="o">)</span> &gt; 0.95
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li><code>kubelet_running_pod_count</code>：节点运行的 Pods 数量</li>
<li><code>kubelet_node_name</code>：节点名称</li>
<li><code>kube_node_status_capacity_pods</code>：节点可运行的最大 Pod 数量</li>
</ul>
<p><strong>KubeNodeReadinessFlapping</strong></p>
<p>监测集群状态，查看集群内节点状态改变的频率。表达式:</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>changes<span class="o">(</span>kube_node_status_condition<span class="o">{</span><span class="nv">status</span><span class="o">=</span><span class="s2">&#34;true&#34;</span>,condition<span class="o">=</span><span class="s2">&#34;Ready&#34;</span><span class="o">}[</span>15m<span class="o">]))</span> by <span class="o">(</span>node<span class="o">)</span> &gt; <span class="m">2</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>KubeletDown</strong></p>
<p>监控 kubelet 服务，down 或者网络出现问题。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">absent<span class="o">(</span>up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kubelet&#34;</span>, <span class="nv">metrics_path</span><span class="o">=</span><span class="s2">&#34;/metrics&#34;</span><span class="o">}</span> <span class="o">==</span> 1<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="集群组件">集群组件</h3>
<p><strong>KubeSchedulerDown</strong></p>
<p>KubeScheduler 失联，监测 KubeScheduler 是否正常。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">absent<span class="o">(</span>up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-scheduler&#34;</span><span class="o">}</span> <span class="o">==</span> 1<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>KubeControllerManagerDown</strong></p>
<p>监测 KubeControllerManager 服务，Down 或者网络不通。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">absent<span class="o">(</span>up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-controller-manager&#34;</span><span class="o">}</span> <span class="o">==</span> 1<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="应用相关">应用相关</h3>
<p><strong>KubePodCrashLooping</strong></p>
<p>Pod 频繁重启：将最近 5 分钟内容器的重启速率换算为每 3 分钟的重启次数，大于 0（即发生过重启）则告警。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">rate<span class="o">(</span>kube_pod_container_status_restarts_total<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}[</span>5m<span class="o">])</span> * <span class="m">60</span> * <span class="m">3</span> &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标:</p>
<ul>
<li>kube_pod_container_status_restarts_total：容器累计重启次数（计数器），数值不再增长即为正常</li>
</ul>
<p><strong>KubePodNotReady</strong></p>
<p>Pods 没有就绪，检测 Pod 是否就绪。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum by <span class="o">(</span>namespace, pod<span class="o">)</span> <span class="o">(</span>max by<span class="o">(</span>namespace, pod<span class="o">)</span> <span class="o">(</span>kube_pod_status_phase<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span>, <span class="nv">phase</span><span class="o">=</span>~<span class="s2">&#34;Pending|Unknown&#34;</span><span class="o">})</span> * on<span class="o">(</span>namespace, pod<span class="o">)</span> group_left<span class="o">(</span>owner_kind<span class="o">)</span> max by<span class="o">(</span>namespace, pod, owner_kind<span class="o">)</span> <span class="o">(</span>kube_pod_owner<span class="o">{</span>owner_kind!<span class="o">=</span><span class="s2">&#34;Job&#34;</span><span class="o">}))</span> &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_pod_status_phase：Pod 状态</li>
</ul>
<p><strong>KubeDeploymentGenerationMismatch</strong></p>
<p>Deployment 部署失败，Deployment 生成的资源与定义的资源不匹配。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kube_deployment_status_observed_generation<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
          !<span class="o">=</span>
        kube_deployment_metadata_generation<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_deployment_status_observed_generation：Deployment 控制器已观察到的 generation（版本序号）</li>
<li>kube_deployment_metadata_generation：Deployment 期望状态对应的 generation（版本序号）</li>
</ul>
<p><strong>KubeDeploymentReplicasMismatch</strong></p>
<p>查看 Deployment 副本是否达到预期。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span><span class="lnt">6
</span><span class="lnt">7
</span><span class="lnt">8
</span><span class="lnt">9
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>
          kube_deployment_spec_replicas<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
            !<span class="o">=</span>
          kube_deployment_status_replicas_available<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
        <span class="o">)</span> and <span class="o">(</span>
          changes<span class="o">(</span>kube_deployment_status_replicas_updated<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}[</span>3m<span class="o">])</span>
            <span class="o">==</span>
          <span class="m">0</span>
        <span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_deployment_spec_replicas：资源定义副本数</li>
<li>kube_deployment_status_replicas_available：正在运行副本数</li>
<li>kube_deployment_status_replicas_updated：更新的副本数</li>
</ul>
<p><strong>KubeStatefulSetReplicasMismatch</strong></p>
<p>监测 StatefulSet 副本是否达到预期。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span><span class="lnt">6
</span><span class="lnt">7
</span><span class="lnt">8
</span><span class="lnt">9
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>
          kube_statefulset_status_replicas_ready<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
            !<span class="o">=</span>
          kube_statefulset_status_replicas<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
        <span class="o">)</span> and <span class="o">(</span>
          changes<span class="o">(</span>kube_statefulset_status_replicas_updated<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}[</span>5m<span class="o">])</span>
            <span class="o">==</span>
          <span class="m">0</span>
        <span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_statefulset_status_replicas_ready：就绪副本数</li>
<li>kube_statefulset_status_replicas：当前副本数</li>
<li>kube_statefulset_status_replicas_updated：更新的副本数</li>
</ul>
<p><strong>KubeStatefulSetUpdateNotRolledOut</strong></p>
<p>StatefulSet 更新尚未全部发布完成（not rolled out），对比版本号和副本数。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt"> 1
</span><span class="lnt"> 2
</span><span class="lnt"> 3
</span><span class="lnt"> 4
</span><span class="lnt"> 5
</span><span class="lnt"> 6
</span><span class="lnt"> 7
</span><span class="lnt"> 8
</span><span class="lnt"> 9
</span><span class="lnt">10
</span><span class="lnt">11
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">max without <span class="o">(</span>revision<span class="o">)</span> <span class="o">(</span>
          kube_statefulset_status_current_revision<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
            unless
          kube_statefulset_status_update_revision<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
        <span class="o">)</span>
          *
        <span class="o">(</span>
          kube_statefulset_replicas<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
            !<span class="o">=</span>
          kube_statefulset_status_replicas_updated<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
        <span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_statefulset_status_replicas：每个 StatefulSet 的副本数。</li>
<li>kube_statefulset_status_replicas_current：每个 StatefulSet 的当前副本数。</li>
<li>kube_statefulset_status_replicas_ready：每个StatefulSet 的就绪副本数。</li>
<li>kube_statefulset_status_replicas_updated：每个StatefulSet 的更新副本数。</li>
<li>kube_statefulset_status_observed_generation：StatefulSet 控制器观察到的 generation（版本序号）。</li>
<li>kube_statefulset_replicas：StatefulSet 所需的副本数。</li>
<li>kube_statefulset_metadata_generation：表示 StatefulSet 期望状态某一特定 generation 的序列号。</li>
<li>kube_statefulset_created：创建时间戳。</li>
<li>kube_statefulset_labels：Kubernetes 标签转换为 Prometheus 标签。</li>
<li>kube_statefulset_status_current_revision：指示用于按顺序 [0，currentReplicas) 生成 Pod 的 StatefulSet 的版本。</li>
<li>kube_statefulset_status_update_revision：指示用于按顺序 [replicas-updatedReplicas，replicas) 生成 Pod 的 StatefulSet 的版本。</li>
</ul>
<p><strong>KubeDaemonSetRolloutStuck</strong></p>
<p>监测 DaemonSet 是否处于就绪状态。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kube_daemonset_status_number_ready<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span>
          /
        kube_daemonset_status_desired_number_scheduled<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span> &lt; 1.00
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_daemonset_status_number_ready：应运行且已有 DaemonSet Pod 就绪的节点数量</li>
<li>kube_daemonset_status_desired_number_scheduled：应该运行 DaemonSet Pod 的节点数量</li>
</ul>
<p><strong>KubeDaemonSetMisScheduled</strong></p>
<p>DaemonSet  运行在不该运行的节点上面。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">kube_daemonset_status_number_misscheduled<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">}</span> &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_daemonset_status_number_misscheduled：运行在不该运行的节点状态</li>
</ul>
<p><strong>KubeDaemonSetNotScheduled</strong></p>
<p>DaemonSet  没有可调度节点去运行。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-fallback" data-lang="fallback">kube_daemonset_status_desired_number_scheduled{job=&#34;kube-state-metrics&#34;,namespace=~&#34;.*&#34;}
  - kube_daemonset_status_current_number_scheduled{job=&#34;kube-state-metrics&#34;,namespace=~&#34;.*&#34;}
  &gt; 0
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_daemonset_status_desired_number_scheduled：应该调度的 DaemonSet 数量</li>
<li>kube_daemonset_status_current_number_scheduled：当前调度的 DaemonSet 数量</li>
</ul>
<p><strong>KubeContainerWaiting</strong></p>
<p>监测哪些容器是在等待状态的。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum by <span class="o">(</span>namespace, pod, container<span class="o">)</span> <span class="o">(</span>kube_pod_container_status_waiting_reason<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span><span class="o">})</span> &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>kube_pod_container_status_waiting_reason：容器生命周期过程中的状态，无论是创建成功还是失败都应该是0。</li>
</ul>
<p><strong>KubeJobCompletion</strong></p>
<p>监控哪些自动任务（Job）没有结束。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"> kube_job_spec_completions<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span>,job_name<span class="o">=</span>~<span class="s2">&#34;.*loki-weixin-notify.*&#34;</span>,namespace<span class="o">=</span>~<span class="s2">&#34;.*&#34;</span><span class="o">}</span>
  - kube_job_status_succeeded<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;kube-state-metrics&#34;</span>,namespace<span class="o">=</span>~<span class="s2">&#34;.*&#34;</span><span class="o">}</span>
  &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="节点相关">节点相关</h3>
<p><strong>NodeClockNotSynchronising</strong></p>
<p>主机与时间服务器失联。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">min_over_time<span class="o">(</span>node_timex_sync_status<span class="o">[</span>5m<span class="o">])</span> <span class="o">==</span> <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_timex_sync_status：同步状态。</li>
</ul>
<p><strong>NodeClockSkewDetected</strong></p>
<p>本地时间偏移量。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span><span class="lnt">6
</span><span class="lnt">7
</span><span class="lnt">8
</span><span class="lnt">9
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>node_timex_offset_seconds &gt; 0.05
        and
          deriv<span class="o">(</span>node_timex_offset_seconds<span class="o">[</span>5m<span class="o">])</span> &gt;<span class="o">=</span> <span class="m">0</span>
        <span class="o">)</span>
        or
        <span class="o">(</span>
          node_timex_offset_seconds &lt; -0.05
        and
          deriv<span class="o">(</span>node_timex_offset_seconds<span class="o">[</span>5m<span class="o">])</span> &lt;<span class="o">=</span> 0<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_timex_offset_seconds：误差</li>
</ul>
<p><strong>NodeHighNumberConntrackEntriesUsed</strong></p>
<p>链接状态跟踪。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>node_nf_conntrack_entries / node_nf_conntrack_entries_limit<span class="o">)</span> &gt; 0.75
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_nf_conntrack_entries：链接状态跟踪表分配的数量</li>
<li>node_nf_conntrack_entries_limit：表总量</li>
</ul>
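<p><strong>NodeNetworkReceiveErrs</strong></p>
<p>网卡接收错误量。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">increase<span class="o">(</span>node_network_receive_errs_total<span class="o">[</span>2m<span class="o">])</span> &gt; <span class="m">10</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>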
<ul>
<li>node_network_receive_errs_total：接收错误总量</li>
</ul>
<p><strong>NodeNetworkTransmitErrs</strong></p>
<p>网卡传输错误量。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">increase<span class="o">(</span>node_network_transmit_errs_total<span class="o">[</span>2m<span class="o">])</span> &gt; <span class="m">10</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_network_transmit_errs_total：传输错误总量</li>
</ul>
<p><strong>NodeFilesystemAlmostOutOfFiles</strong></p>
<p>inode 数量监测。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>
          node_filesystem_files_free<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> / node_filesystem_files<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> * <span class="m">100</span> &lt; <span class="m">5</span>
        and
          node_filesystem_readonly<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> <span class="o">==</span> <span class="m">0</span>
        <span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_filesystem_files_free：空闲的 inode</li>
<li>node_filesystem_files：inodes 总量</li>
</ul>
<p><strong>NodeFilesystemFilesFillingUp</strong></p>
<p>inode 耗尽预测，根据最近 6 小时的变化趋势线性预测未来 24 小时或 4 小时内 inode 是否会耗尽（下面的表达式为 4 小时的版本）。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>node_filesystem_files_free<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> / node_filesystem_files<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> * <span class="m">100</span> &lt; <span class="m">20</span>
        and
          predict_linear<span class="o">(</span>node_filesystem_files_free<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}[</span>6h<span class="o">]</span>, 4*60*60<span class="o">)</span> &lt; <span class="m">0</span>
        and
          node_filesystem_readonly<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> <span class="o">==</span> 0<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_filesystem_files_free：空闲的 inode</li>
<li>node_filesystem_files：inodes 总量</li>
</ul>
<p><strong>NodeFilesystemAlmostOutOfSpace</strong></p>
<p>分区容量使用率。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>node_filesystem_avail_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> / node_filesystem_size_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> * <span class="m">100</span> &lt; <span class="m">10</span>
        and
          node_filesystem_readonly<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> <span class="o">==</span> <span class="m">0</span>
        <span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_filesystem_avail_bytes：空闲容量</li>
<li>node_filesystem_size_bytes：总容量</li>
</ul>
<p><strong>NodeFilesystemSpaceFillingUp</strong></p>
<p>分区容量耗尽预测，根据最近 6 小时的变化趋势线性预测未来 24 小时或 4 小时内分区容量是否会耗尽（下面的表达式为 4 小时的版本）。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span><span class="lnt">3
</span><span class="lnt">4
</span><span class="lnt">5
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>node_filesystem_avail_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> / node_filesystem_size_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> * <span class="m">100</span> &lt; <span class="m">15</span>
        and
          predict_linear<span class="o">(</span>node_filesystem_avail_bytes<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}[</span>6h<span class="o">]</span>, 4*60*60<span class="o">)</span> &lt; <span class="m">0</span>
        and
          node_filesystem_readonly<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;node-exporter&#34;</span>,fstype!<span class="o">=</span><span class="s2">&#34;&#34;</span><span class="o">}</span> <span class="o">==</span> 0<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>node_filesystem_avail_bytes：空闲容量</li>
<li>node_filesystem_size_bytes：总容量</li>
</ul>
<p><strong>NodeMemAvaliable</strong></p>
<p>物理节点可用内存。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>node_memory_MemAvailable_bytes / 1024/1024/1024<span class="o">)</span> by <span class="o">(</span>instance<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>NodeMemUsePresent</strong></p>
<p>物理内存使用率。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="o">(</span>sum<span class="o">((</span>node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Buffers_bytes-node_memory_Cached_bytes-node_memory_Slab_bytes<span class="o">)</span>/node_memory_MemTotal_bytes<span class="o">)</span> by <span class="o">(</span>instance<span class="o">))</span>*100 &gt; <span class="m">80</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>NodeCpuUsePresent</strong></p>
<p>物理 CPU 使用率。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash"><span class="m">100</span> - <span class="o">(</span>avg by <span class="o">(</span>instance<span class="o">)</span> <span class="o">(</span>irate<span class="o">(</span>node_cpu_seconds_total<span class="o">{</span><span class="nv">mode</span><span class="o">=</span><span class="s2">&#34;idle&#34;</span><span class="o">}[</span>5m<span class="o">]))</span> * 100<span class="o">)</span> &gt; <span class="m">80</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>NodeLoad</strong></p>
<p>物理节点15分钟平均负载load15。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum <span class="o">(</span>node_load15<span class="o">{</span><span class="nv">instance</span><span class="o">=</span>~<span class="s2">&#34;.*15.*&#34;</span><span class="o">})</span> by <span class="o">(</span>instance<span class="o">)</span> &gt; <span class="m">10</span>
sum <span class="o">(</span>node_load15<span class="o">{</span>instance!~<span class="s2">&#34;.*15.*&#34;</span><span class="o">})</span> by <span class="o">(</span>instance<span class="o">)</span> &gt; <span class="m">20</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="etcd相关">Etcd相关</h3>
<p><strong>Etcdlived</strong></p>
<p>etcd 存活检测。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;etcd&#34;</span><span class="o">}</span> &lt; <span class="m">1</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>EtcdCluseterUnavailable</strong></p>
<p>etcd 集群健康检查，down 数量大于集群可允许故障数量。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">count<span class="o">(</span>up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;etcd&#34;</span><span class="o">}</span> <span class="o">==</span> 0<span class="o">)</span> &gt; <span class="o">(</span>count<span class="o">(</span>up<span class="o">{</span><span class="nv">job</span><span class="o">=</span><span class="s2">&#34;etcd&#34;</span><span class="o">})</span> / <span class="m">2</span> - 1<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>EtcdLeaderCheck</strong></p>
<p>检查 leader。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">max<span class="o">(</span>etcd_server_has_leader<span class="o">)</span> !<span class="o">=</span> <span class="m">1</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>EtcdBackendFsync</strong></p>
<p>etcd io 监测，后端提交延时。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">histogram_quantile<span class="o">(</span>0.99, sum<span class="o">(</span>rate<span class="o">(</span>etcd_disk_backend_commit_duration_seconds_bucket<span class="o">[</span>5m<span class="o">]))</span> by <span class="o">(</span>instance, le<span class="o">))</span> &gt; <span class="m">100</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>EtcdWalFsync</strong></p>
<p>etcd io 监测，文件同步到磁盘延时。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">histogram_quantile<span class="o">(</span>0.99, sum<span class="o">(</span>rate<span class="o">(</span>etcd_disk_wal_fsync_duration_seconds_bucket<span class="o">[</span>5m<span class="o">]))</span> by <span class="o">(</span>instance, le<span class="o">))</span> &gt; <span class="m">100</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>EtcdDbSize</strong></p>
<p>检测数据库大小。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">etcd_debugging_mvcc_db_total_size_in_bytes/1024/1024 &gt; <span class="m">1024</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>EtcdGrpc</strong></p>
<p>gRPC 调用速率。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>rate<span class="o">(</span>grpc_server_handled_total<span class="o">{</span><span class="nv">grpc_type</span><span class="o">=</span><span class="s2">&#34;unary&#34;</span><span class="o">}[</span>1m<span class="o">]))</span> &gt; <span class="m">100</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="coredns-相关">CoreDNS 相关</h3>
<p><strong>DnsRequest</strong></p>
<p>DNS 查询速率，每秒查询超过 100 告警（irate 计算的是每秒速率）。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>irate<span class="o">(</span>coredns_dns_request_count_total<span class="o">{</span>zone !<span class="o">=</span><span class="s2">&#34;dropped&#34;</span><span class="o">}[</span>1m<span class="o">]))</span> &gt; <span class="m">100</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>coredns_dns_request_count_total：总查询数</li>
</ul>
<p><strong>DnsRequestFailed</strong></p>
<p>异常查询，异常状态码，不是 NOERROR。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">irate<span class="o">(</span>coredns_dns_response_rcode_count_total<span class="o">{</span>rcode!<span class="o">=</span><span class="s2">&#34;NOERROR&#34;</span><span class="o">}</span> <span class="o">[</span>1m<span class="o">])</span> &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p>相关指标：</p>
<ul>
<li>coredns_dns_response_rcode_count_total：查询返回状态码</li>
</ul>
<p>DNS-Rcode：</p>
<p>DNS-Rcode 作为 DNS 应答报文中有效的字段，主要用来说明 DNS 应答状态，是排查<a href="https://cloud.tencent.com/product/cns?from=10680" target="_blank" rel="noopener noreferrer">域名解析</a>失败的重要指标。常见的 Rcode 值如下：</p>
<ul>
<li>Rcode 值为0，对应的 DNS 应答状态为 NOERROR，意思是成功的响应，即这个域名解析是成功的</li>
<li>Rcode 值为2，对应的 DNS 应答状态为 SERVFAIL，意思是服务器失败，也就是这个域名的权威服务器拒绝响应或者响应 REFUSED，递归服务器返回 Rcode 值为 2 给 CLIENT</li>
<li>Rcode 值为3，对应的 DNS 应答状态为 NXDOMAIN，意思是不存在的记录，也就是这个具体的域名在权威服务器中并不存在</li>
<li>Rcode 值为5，对应的 DNS 应答状态为 REFUSED，意思是拒绝，也就是这个请求源IP不在服务的范围内</li>
</ul>
<p><strong>DnsPanic</strong></p>
<p>DNS 恐慌值，可能受到攻击。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">irate<span class="o">(</span>coredns_panic_count_total<span class="o">[</span>1m<span class="o">])</span> &gt; <span class="m">100</span>
</code></pre></td></tr></table>
</div>
</div><h3 id="rabbitmq-相关">RabbitMq 相关</h3>
<p><strong>RabbitmqDown</strong></p>
<p>监控 rabbitmq 服务，down 或者网络出现问题。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">absent<span class="o">(</span><span class="nv">rabbitmq_up</span> <span class="o">==</span> 1<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>RabbitmqMessages</strong></p>
<p>监控哪些 queue 的 message 数量大于 250，以及哪些 queue 的 message 数量大于 500。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span><span class="lnt">2
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">rabbitmq_queue_messages &gt; <span class="m">250</span>
rabbitmq_queue_messages &gt; <span class="m">500</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>RabbitmqNotRunning</strong></p>
<p>监控 rabbitmq 哪些节点已经停止运行。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">rabbitmq_running !<span class="o">=</span> <span class="m">1</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>RabbitmqQueueUnacknowledged</strong></p>
<p>监控 queue 中未应答消息的数量。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">rabbitmq_queue_messages_unacknowledged &gt; <span class="m">0</span>
</code></pre></td></tr></table>
</div>
</div><p><strong>RabbitmqNodeDiskFree</strong></p>
<p>监控rabbitmq可使用的磁盘空间大于5G的节点。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>rabbitmq_node_disk_free /1024/1024/1024<span class="o">)</span> by <span class="o">(</span>node<span class="o">)</span> &gt;5
</code></pre></td></tr></table>
</div>
</div><p><strong>RabbitmqFreeUsed</strong></p>
<p>监控rabbitmq 使用的物理内存。表达式：</p>
<div class="highlight"><div class="chroma">
<table class="lntable"><tr><td class="lntd">
<pre class="chroma"><code><span class="lnt">1
</span></code></pre></td>
<td class="lntd">
<pre class="chroma"><code class="language-bash" data-lang="bash">sum<span class="o">(</span>rabbitmq_node_mem_used /1024/1024<span class="o">)</span> by <span class="o">(</span>node<span class="o">)</span>
</code></pre></td></tr></table>
</div>
</div></div><div id="comments"></div></div></div>
            </main><footer class="footer">
        <div class="footer-container"><div class="footer-line"><i class="far fa-copyright fa-fw"></i><span itemprop="copyrightYear">2019 - 2021</span><span class="author" itemprop="copyrightHolder">&nbsp;<a href="/" target="_blank">mikel pan</a></span>&nbsp;|&nbsp;<span class="license"><a rel="license external nofollow noopener noreferrer" href="https://creativecommons.org/licenses/by-nc/4.0/" target="_blank">CC BY-NC 4.0</a></span><span class="icp-splitter">&nbsp;|&nbsp;</span><br class="icp-br"/>
                    <span class="icp"><a href="https://beian.miit.gov.cn/" target="_blank">粤ICP备2021047442号</a></span></div>
        </div>
    </footer></div>

        <div id="fixed-buttons"><a href="#" id="back-to-top" class="fixed-button" title="回到顶部">
                <i class="fas fa-arrow-up fa-fw"></i>
            </a><a href="#" id="view-comments" class="fixed-button" title="查看评论">
                <i class="fas fa-comment fa-fw"></i>
            </a>
        </div><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/lightgallery.js@1.2.0/dist/css/lightgallery.min.css"><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/smooth-scroll@16.1.3/dist/smooth-scroll.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/autocomplete.js@0.37.1/dist/autocomplete.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/algoliasearch@4.2.0/dist/algoliasearch-lite.umd.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/lazysizes@5.2.2/lazysizes.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/lightgallery.js@1.2.0/dist/js/lightgallery.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/lg-thumbnail.js@1.2.0/dist/lg-thumbnail.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/lg-zoom.js@1.2.0/dist/lg-zoom.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/clipboard@2.0.6/dist/clipboard.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/sharer.js@0.4.0/sharer.min.js"></script><script type="text/javascript">window.config={"code":{"copyTitle":"复制到剪贴板","maxShownLines":10},"comment":{},"lightGallery":{"actualSize":false,"exThumbImage":"data-thumbnail","hideBarsDelay":2000,"selector":".lightgallery","speed":400,"thumbContHeight":80,"thumbWidth":80,"thumbnail":true},"search":{"algoliaAppID":"REQJX89W85","algoliaIndex":"index.zh-cn","algoliaSearchKey":"63fa048de9b35627f46672e95abc14df","highlightTag":"em","maxResultLength":10,"noResultsFound":"没有找到结果","snippetLength":50,"type":"algolia"}};</script><script type="text/javascript" src="/js/theme.min.js"></script></body>
</html>
